Dylan Rohan - a1844790
# Loading modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import scipy.stats as stats
from plotly.subplots import make_subplots
import plotly.graph_objects as go
# Selecting and preparing data
from sklearn import datasets

# Load the dataset once and reuse the returned bunch for the values,
# the column names and the class labels (instead of loading it three times).
wine = datasets.load_wine()
wine_df = pd.DataFrame(wine.data, columns=wine.feature_names)
wine_class = wine.target

# Put the class label in the first column so each row starts with its class.
wine_df.insert(0, 'Class', wine_class)
wine_df.head()
| Class | alcohol | malic_acid | ash | alcalinity_of_ash | magnesium | total_phenols | flavanoids | nonflavanoid_phenols | proanthocyanins | color_intensity | hue | od280/od315_of_diluted_wines | proline | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 14.23 | 1.71 | 2.43 | 15.6 | 127.0 | 2.80 | 3.06 | 0.28 | 2.29 | 5.64 | 1.04 | 3.92 | 1065.0 |
| 1 | 0 | 13.20 | 1.78 | 2.14 | 11.2 | 100.0 | 2.65 | 2.76 | 0.26 | 1.28 | 4.38 | 1.05 | 3.40 | 1050.0 |
| 2 | 0 | 13.16 | 2.36 | 2.67 | 18.6 | 101.0 | 2.80 | 3.24 | 0.30 | 2.81 | 5.68 | 1.03 | 3.17 | 1185.0 |
| 3 | 0 | 14.37 | 1.95 | 2.50 | 16.8 | 113.0 | 3.85 | 3.49 | 0.24 | 2.18 | 7.80 | 0.86 | 3.45 | 1480.0 |
| 4 | 0 | 13.24 | 2.59 | 2.87 | 21.0 | 118.0 | 2.80 | 2.69 | 0.39 | 1.82 | 4.32 | 1.04 | 2.93 | 735.0 |
The dataset I have selected contains 13 attributes of 178 wines and appears free of missing data. It is usually used as a practice dataset for a classifier model, but I will be using it to compare the populations of the three classes of wine on a given attribute. The class indicates the cultivator of the wine, but all three come from the same region in Italy.
# Gathering information about the dataset
# Each tuple holds the positional arguments of one print() call, in the
# order the report should appear: missing values, shape, column names,
# then the per-column minimum and maximum.
dataset_summary = [
    ("Missing Data:\n", wine_df.isna().sum()),
    ("Shape of dataframe: \n", wine_df.shape, "\n"),
    ("Attributes: \n", wine_df.columns, '\n'),
    ("Min values: \n", wine_df.min(), '\n'),
    ("Max values: \n", wine_df.max(), '\n'),
]
for report_args in dataset_summary:
    print(*report_args)
Missing Data:
Class 0
alcohol 0
malic_acid 0
ash 0
alcalinity_of_ash 0
magnesium 0
total_phenols 0
flavanoids 0
nonflavanoid_phenols 0
proanthocyanins 0
color_intensity 0
hue 0
od280/od315_of_diluted_wines 0
proline 0
dtype: int64
Shape of dataframe:
(178, 14)
Attributes:
Index(['Class', 'alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash',
'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols',
'proanthocyanins', 'color_intensity', 'hue',
'od280/od315_of_diluted_wines', 'proline'],
dtype='object')
Min values:
Class 0.00
alcohol 11.03
malic_acid 0.74
ash 1.36
alcalinity_of_ash 10.60
magnesium 70.00
total_phenols 0.98
flavanoids 0.34
nonflavanoid_phenols 0.13
proanthocyanins 0.41
color_intensity 1.28
hue 0.48
od280/od315_of_diluted_wines 1.27
proline 278.00
dtype: float64
Max values:
Class 2.00
alcohol 14.83
malic_acid 5.80
ash 3.23
alcalinity_of_ash 30.00
magnesium 162.00
total_phenols 3.88
flavanoids 5.08
nonflavanoid_phenols 0.66
proanthocyanins 3.58
color_intensity 13.00
hue 1.71
od280/od315_of_diluted_wines 4.00
proline 1680.00
dtype: float64
Below I explore the data using multiple violin plots and run a one-way ANOVA on each attribute to determine whether it differs significantly among the three classes. The three classes of wine were found to differ significantly (p < 0.05) on every attribute. This means that, depending on which attribute you find most important in an alcoholic drink, wine from one of these cultivators may suit your tastes better than the others.
(The default hover seems to supply all the information one might want, but it can be altered with the hovertemplate attribute)
# Demonstrating how you can quickly make subplots to view all attributes
#### UNITS NOT PROVIDED IN DOCUMENTATION ####
# Attributes of interest: one violin subplot is drawn per entry, in this order
attributes = ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols',
              'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue',
              'od280/od315_of_diluted_wines', 'proline']

# Title-cased subtitle for each subplot
Subtitles = [(attribute + " in each class of wine").title() for attribute in attributes]

# Creating a plotly figure object: one row per attribute, stacked in a single column
wine_fig = make_subplots(
    rows=len(attributes),
    cols=1,
    subplot_titles=Subtitles,
    vertical_spacing=0.02)

# Subplot rows are 1-based, so enumerate from 1 rather than keeping a manual
# counter. The trace and its axis labels are added in the same pass.
for row, attribute in enumerate(attributes, start=1):
    wine_fig.add_trace(
        go.Violin(x=wine_df['Class'], y=wine_df[attribute],
                  box_visible=True,
                  name=attribute),  # If you want a legend
        row=row, col=1,
    )
    wine_fig.update_xaxes(title_text="Class", row=row, col=1)  # Adding x-axis label
    wine_fig.update_yaxes(title_text=attribute.title() + " Content", row=row, col=1)  # Adding y-axis label

# Cleaning the figure up: tall canvas so 13 stacked subplots stay readable;
# the legend is hidden because every subplot already carries its own title.
wine_fig.update_layout(height=5000, width=800,
                       title_text="Violin Subplots For Each Attribute",
                       showlegend=False)
wine_fig
# Demonstrating other forms the violin plot can have on plotly
# Variant with scatter points: one violin per class, with every individual
# alcohol observation drawn alongside it (points='all').
wine_fig2 = px.violin(
    data_frame=wine_df,
    x='Class',
    y='alcohol',
    color='Class',
    points='all',
)
scatter_layout = dict(
    title_text="Violin plot with associated alcohol instances",
    width=800,
    height=500,
    showlegend=False,
)
wine_fig2.update_layout(**scatter_layout)
# Overlaid variant: the violins of all classes share one axis.
# The legend is kept visible so layers can be toggled by clicking on it.
wine_fig3 = px.violin(
    data_frame=wine_df,
    x='alcohol',
    color='Class',
    violinmode='overlay',
)
overlay_layout = dict(
    title_text="Overlayed violin plots for the alcohol content of each Class of wine",
    width=800,
    height=500,
    showlegend=True,
)
wine_fig3.update_layout(**overlay_layout)
# One-way ANOVA: for every attribute, test whether its mean differs between
# the wine classes and report the attributes where it does.

# Single significance level shared by the p-value test and the F-critical
# value. Previously the F-critical value was computed at 0.005 while the
# p-value was compared against 0.05.
alpha = 0.05

anova_attributes = ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols',
                    'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue',
                    'od280/od315_of_diluted_wines', 'proline']

# Degrees of freedom are derived from the data instead of being hard-coded:
# between groups = k - 1, within groups = N - k.
n_classes = wine_df['Class'].nunique()
# The critical value depends only on alpha and the degrees of freedom, so it
# is computed once, outside the loop.
f_crit = stats.f.ppf(q=1 - alpha, dfn=n_classes - 1, dfd=len(wine_df) - n_classes)

count = 1  # running number of the significant results printed so far
for attribute in anova_attributes:
    # One sample of this attribute per class
    samples = [values for _, values in wine_df.groupby('Class')[attribute]]
    fstat, p_value = stats.f_oneway(*samples)

    # Plain boolean `and`: the earlier `==True & ... ==True` form only gave
    # the right answer through operator precedence and comparison chaining.
    if p_value <= alpha and fstat > f_crit:
        print(count,") When comparing the ", attribute , " in/for each class, the classes were statistically different:\n", "p-value =", p_value)
        print("F-critical = ",f_crit,", f-stat =", fstat)
        print()
        count = count + 1
1 ) When comparing the alcohol in/for each class, the classes were statistically different: p-value = 3.319503795619655e-36 F-critical = 5.462017136271799 , f-stat = 135.07762424279912 2 ) When comparing the malic_acid in/for each class, the classes were statistically different: p-value = 4.127228798041834e-14 F-critical = 5.462017136271799 , f-stat = 36.94342496318366 3 ) When comparing the ash in/for each class, the classes were statistically different: p-value = 4.149967974793239e-06 F-critical = 5.462017136271799 , f-stat = 13.312901199991039 4 ) When comparing the alcalinity_of_ash in/for each class, the classes were statistically different: p-value = 9.444472938826817e-14 F-critical = 5.462017136271799 , f-stat = 35.77163740730924 5 ) When comparing the magnesium in/for each class, the classes were statistically different: p-value = 8.963395439251042e-06 F-critical = 5.462017136271799 , f-stat = 12.4295843381499 6 ) When comparing the total_phenols in/for each class, the classes were statistically different: p-value = 2.1376700154385954e-28 F-critical = 5.462017136271799 , f-stat = 93.73300962036718 7 ) When comparing the flavanoids in/for each class, the classes were statistically different: p-value = 3.5985858307136404e-50 F-critical = 5.462017136271799 , f-stat = 233.92587268154935 8 ) When comparing the nonflavanoid_phenols in/for each class, the classes were statistically different: p-value = 3.88804090047893e-11 F-critical = 5.462017136271799 , f-stat = 27.575417146965872 9 ) When comparing the proanthocyanins in/for each class, the classes were statistically different: p-value = 5.125358737546706e-12 F-critical = 5.462017136271799 , f-stat = 30.27138317022762 10 ) When comparing the color_intensity in/for each class, the classes were statistically different: p-value = 1.162008021927618e-33 F-critical = 5.462017136271799 , f-stat = 120.66401844100312 11 ) When comparing the hue in/for each class, the classes were statistically different: p-value = 
5.9176622163620664e-30 F-critical = 5.462017136271799 , f-stat = 101.31679539030002 12 ) When comparing the od280/od315_of_diluted_wines in/for each class, the classes were statistically different: p-value = 1.393104956942896e-44 F-critical = 5.462017136271799 , f-stat = 189.97232057888917 13 ) When comparing the proline in/for each class, the classes were statistically different: p-value = 5.783168356105498e-47 F-critical = 5.462017136271799 , f-stat = 207.92037390217774
# Load seaborn's 'planets' example dataset and preview its first rows.
planets_df = sns.load_dataset('planets')
planets_df.head()
# Each row seems to hold the observations/calculations recorded for one planet,
# together with the detection method that was used to obtain them.
| method | number | orbital_period | mass | distance | year | |
|---|---|---|---|---|---|---|
| 0 | Radial Velocity | 1 | 269.300 | 7.10 | 77.40 | 2006 |
| 1 | Radial Velocity | 1 | 874.774 | 2.21 | 56.95 | 2008 |
| 2 | Radial Velocity | 1 | 763.000 | 2.60 | 19.84 | 2011 |
| 3 | Radial Velocity | 1 | 326.030 | 19.40 | 110.62 | 2007 |
| 4 | Radial Velocity | 1 | 516.220 | 10.50 | 119.47 | 2009 |
# Row order of the detection methods, chosen to match the reference picture
order = ["Radial Velocity", "Imaging", "Eclipse Timing Variations", "Transit", "Astrometry",
         "Transit Timing Variations", "Orbital Brightness Modulation", "Microlensing", "Pulsar Timing",
         "Pulsation Timing Variations"]

copied_fig = plt.figure(figsize=(8, 8))
ax = sns.boxplot(data=planets_df,  # selecting the data
                 x='distance',
                 y='method',
                 orient='h',  # making horizontal boxplots
                 palette='vlag', saturation=0.7,  # getting the colour right
                 order=order,  # setting the order to match that in the picture
                 whis=200)  # whiskers at 200 x IQR, i.e. effectively the full data range

# Setting axis to match source image: logarithmic distance axis.
# One call is enough; the extra `ax.semilogx()` only repeated this setting.
ax.set_xscale('log')

# Swarm plot of the individual observations, as seen in the picture.
# It is drawn on `ax` explicitly rather than on whichever axes is current.
# NOTE: the recorded run warned that about 10.7% of the points could not be
# placed at this marker size; smaller markers or a stripplot would avoid that.
sns.swarmplot(data=planets_df,
              x='distance',
              y='method',
              color="black",
              orient='h',
              size=2.2,
              order=order,
              ax=ax,
              )

ax.xaxis.grid(True)  # vertical grid lines only
sns.despine(left=True)
ax.set_ylabel(None)  # the method names label the rows themselves
ax.set_xlabel("distance")
# Last expression of the cell: returns the resulting axis limits
plt.axis('auto')
C:\Users\rohad\anaconda3\lib\site-packages\seaborn\categorical.py:1296: UserWarning: 10.7% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
(0.8717250189453656, 13163.554734132505, 8.5, -0.5)